notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from collections import Counter

import plotly as py
py.offline.init_notebook_mode()

#import colorlover as cl

from IPython.display import HTML, display



In [2]:

    
# Global seaborn look for every matplotlib figure in this notebook:
# the "poster" context scales fonts/lines up, the "ticks" style draws axis ticks.
sns.set_context("poster")
sns.set_style("ticks")



In [3]:

    
# Raw topic key (the part of `topic_name` before the first '/') -> display label.
TOPIC_MAPPING = {
    "GunControl": "Gun Control",
    "Privacy": "Privacy",
    "Vaccine": "Vaccine",
    "ChildEducation": "Child Education",
    "SkinDamage": "Skin Damage",
    "SeatBelt": "Seat Belt",
}

# Fixed display order of the topics for the per-topic plots and tables.
topic_order = [
    "Gun Control",
    "Privacy",
    "Vaccine",
    "Child Education",
    "Skin Damage",
    "Seat Belt",
]

# Load the analysis table and replace the raw topic path by its display label.
# (No column renaming is needed: the controversy column is already
# spelled `is_controversial` in the stored frame.)
df = pd.read_hdf("FINAL_ANALYSIS_DATA.h5", "final_data").assign(
    topic_name=lambda frame: frame["topic_name"].map(
        lambda raw: TOPIC_MAPPING[raw.split('/')[0]]
    )
)

# Location codes in `u_state` that are excluded from the state-level plots.
NON_STATES = {"UNK", "USA", "AS", "DC", "GU", "MP", "PR", "VI"}



In [4]:

    
# Show every column available in the loaded frame.
df.columns









    Out[4]:





Index([          u'Author',       u'City/Urban',  u'City/Urban Area',
               u'Contents',             u'Date',        u'Followers',
              u'Following',             u'GUID',           u'Gender',
                   u'Name',            u'Posts',              u'RT?',
           u'State/Region',              u'URL',             u'URL?',
              u'adjective',           u'adverb',      u'count_tweet',
               u'hashtag?',         u'mention?',         u'negation',
                   u'noun',      u'preposition',    u'processedPost',
              u'sentiment',   u'sentiment_subj', u'subjectvity_type',
                   u't_id',       u'topic_name',             u'verb',
              u't_created',       u't_retweets',      u't_favorites',
             u't_is_reply',       u't_is_quote',     u't_n_hashtags',
               u't_n_urls',     u't_n_mentions',        u't_n_media',
                   u'u_id',        u'u_created',       u'u_n_listed',
          u'u_n_favorites',    u'u_n_followers',      u'u_n_friends',
           u'u_n_statuses',    u'u_is_verified',       u'u_location',
                 u'u_name',            u'u_url', u'is_controversial',
                    u'TID',             u'CATS',          u'u_state'],
      dtype='object')



In [5]:

    
# Number of distinct URL categories per tweet; rows with a missing CATS value
# are filled with 0 and counted as a single 'UNK' placeholder category.
cats_per_tweet = df["CATS"].fillna(0).apply(
    lambda cats: len(Counter(['UNK']) if cats == 0 else Counter(cats))
)
cats_per_tweet.describe()









    Out[5]:





count    246869.000000
mean          1.139163
std           0.356983
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           5.000000
Name: CATS, dtype: float64



In [6]:

    
# Attach a Counter of URL categories to every tweet. Rows with a missing CATS
# value are filled with 0 and represented by a single 'NONE' placeholder.
df["CATS_Counter"] = df.CATS.fillna(0).apply(
    lambda x: Counter(['NONE']) if x == 0 else Counter(x)
)

# Peek at tweets carrying exactly two distinct categories.
# `.loc` replaces the deprecated `.ix` indexer (boolean mask + column label).
df.loc[df.CATS_Counter.apply(len) == 2, "CATS_Counter"].head()









    



/home/content/anaconda3/envs/python2/lib/python2.7/site-packages/ipykernel/__main__.py:6: DeprecationWarning:


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix







    Out[6]:





23     {u'socialmedia': 1, u'videos': 1}
29    {u'twitter': 1, u'socialmedia': 1}
38     {u'socialmedia': 1, u'videos': 1}
53     {u'socialmedia': 1, u'videos': 1}
54    {u'twitter': 1, u'socialmedia': 1}
Name: CATS_Counter, dtype: object

Plotly plot



In [7]:

    
def get_string(x, cols):
    """Format selected fields of ``x`` as an HTML hover label.

    Each key in ``cols`` becomes one ``"Key: value"`` line (key title-cased);
    the lines are joined with ``<br>``.
    """
    lines = []
    for key in cols:
        lines.append("%s: %s" % (key.title(), x[key]))
    return "<br>".join(lines)



In [8]:

    
# Six-stop colour scale given as [position, rgb] pairs, light to dark purple.
scl = [
    [0.0, 'rgb(242,240,247)'],
    [0.2, 'rgb(218,218,235)'],
    [0.4, 'rgb(188,189,220)'],
    [0.6, 'rgb(158,154,200)'],
    [0.8, 'rgb(117,107,177)'],
    [1.0, 'rgb(84,39,143)'],
]



In [9]:

    
# Mean / row count / std of `is_controversial` per `u_state`,
# leaving out rows whose location is only known as "USA".
(
    df[df["u_state"] != "USA"]
    .groupby("u_state")["is_controversial"]
    .agg([np.mean, len, np.std])
    .reset_index()
)



In [10]:

    
def get_string(x):
    """Format every entry of a row as an HTML hover label.

    Iterates over ``x.index``; each label becomes one ``"Label: value"`` line
    (label title-cased) and the lines are joined with ``<br>``.
    """
    lines = []
    for label in x.index:
        lines.append("%s: %s" % (label.title(), x[label]))
    return "<br>".join(lines)

def plot_map(df, location_col, value_col, text_cols,
            scl="Portland", title="", cbar_title=""):
    """Draw an interactive US-state choropleth inline with plotly offline.

    Parameters
    ----------
    df : frame with one row per location.
    location_col : column holding the location codes (plotted in
        'USA-states' location mode).
    value_col : column whose values (cast to float) colour the map.
    text_cols : columns rendered into the hover text via ``get_string``.
    scl : colour scale passed to plotly as ``colorscale``.
    title : figure title; "(Hover for details)" is appended on a second line.
    cbar_title : title of the colour bar.
    """
    choropleth = {
        "type": 'choropleth',
        "colorscale": scl,
        "autocolorscale": False,
        "locations": df[location_col],
        "z": df[value_col].astype(float),
        "locationmode": 'USA-states',
        "text": df[text_cols].apply(get_string, axis=1),
        # White outlines between neighbouring states.
        "marker": {"line": {"color": 'rgb(255,255,255)', "width": 2}},
        "colorbar": {"title": cbar_title},
    }

    layout = {
        "title": '%s<br>(Hover for details)' % title,
        "geo": {
            "scope": 'usa',
            "projection": {"type": 'albers usa'},
            "showlakes": False,
        },
    }

    figure = {"data": [choropleth], "layout": layout}
    py.offline.iplot(figure, filename='d3-cloropleth-map')



In [11]:

    
# Purple colour stops (light to dark). Defined here, but the call below
# passes the named 'Portland' scale instead.
scl = [
    [0.0, 'rgb(242,240,247)'],
    [0.2, 'rgb(218,218,235)'],
    [0.4, 'rgb(188,189,220)'],
    [0.6, 'rgb(158,154,200)'],
    [0.8, 'rgb(117,107,177)'],
    [1.0, 'rgb(84,39,143)'],
]

# Mean / row count / std of `is_controversial` per state, without "USA" rows.
located = df[df.u_state != "USA"]
df_t = (located.groupby("u_state")["is_controversial"]
        .agg([np.mean, len, np.std])
        .reset_index())

plot_map(df_t, "u_state", "mean", ["u_state", "len", "std"],
         scl='Portland',
         title="Proportion of controversial tweets per state",
         cbar_title="Proportion")



In [12]:

    
# Per-tweet count of 'fakenews' entries in CATS_Counter, then
# mean / row count / std of that count per `u_state`.
fakenews_counts = df.CATS_Counter.apply(lambda cats: cats.get('fakenews', 0))
df_t = (df.assign(fakenews=fakenews_counts)[["u_state", "fakenews"]]
        .groupby("u_state")["fakenews"]
        .agg([np.mean, len, np.std])
        .reset_index())

plot_map(df_t, "u_state", "mean", ["u_state", "len", "std"],
         scl='Portland',
         title="Proportion of fakenews urls per state",
         cbar_title="Proportion")



In [13]:

    
# One map per URL category, computed over tweets that contain at least one URL
# and whose location is not just "USA".
for url_type in ["fakenews", "news", "blog"]:
    tweets_with_urls = df[(df.u_state != "USA") & (df.t_n_urls > 0)]
    per_tweet = tweets_with_urls.assign(**{
        url_type: lambda frame: frame.CATS_Counter.apply(
            lambda cats: cats.get(url_type, 0))
    })
    df_t = (per_tweet[["u_state", url_type]]
            .groupby("u_state")[url_type]
            .agg([np.mean, len, np.std])
            .reset_index())

    # Keep locations with at least the 10th-largest row count, then drop the
    # non-state codes (so fewer than ten locations may remain on the map).
    tenth_largest = df_t["len"].sort_values().values[-10]
    keep = (df_t["len"] >= tenth_largest) & (~df_t["u_state"].isin(NON_STATES))

    plot_map(df_t[keep], "u_state", "mean", ["u_state", "len", "std"],
             scl='Portland',
             title="Proportion of %s urls (in tweets with URLs) per state" % url_type.title(),
             cbar_title="Proportion")

Split by topics



In [14]:

    
# Number of tweets per topic, most frequent first.
df["topic_name"].value_counts()









    Out[14]:





Privacy            73593
Seat Belt          73270
Vaccine            40713
Gun Control        34357
Skin Damage        14128
Child Education    10808
Name: topic_name, dtype: int64

Fake News Maps



In [15]:

    
# Purple colour stops (light to dark). Defined here, but the maps below
# pass the named "Portland" scale instead.
scl = [
    [0.0, 'rgb(242,240,247)'],
    [0.2, 'rgb(218,218,235)'],
    [0.4, 'rgb(188,189,220)'],
    [0.6, 'rgb(158,154,200)'],
    [0.8, 'rgb(117,107,177)'],
    [1.0, 'rgb(84,39,143)'],
]
url_type = "fakenews"

# One map per topic, over tweets with at least one URL and a non-"USA" location.
for topic in topic_order:
    topic_rows = df[(df.u_state != "USA")
                    & (df.t_n_urls > 0)
                    & (df.topic_name == topic)]
    per_tweet = topic_rows.assign(**{
        url_type: lambda frame: frame.CATS_Counter.apply(
            lambda cats: cats.get(url_type, 0))
    })
    df_t = (per_tweet[["u_state", url_type]]
            .groupby("u_state")[url_type]
            .agg([np.mean, len, np.std])
            .reset_index())
    # Rank 1 = highest mean; computed before the filtering below.
    df_t["value_rank"] = df_t["mean"].rank(ascending=False)

    # Keep locations with at least the 10th-largest row count, then drop
    # the non-state codes.
    tenth_largest = df_t["len"].sort_values().values[-10]
    keep = (df_t["len"] >= tenth_largest) & (~df_t["u_state"].isin(NON_STATES))

    plot_map(df_t[keep], "u_state", "mean",
             ["u_state", "value_rank", "mean", "len", "std"],
             scl="Portland",
             title=topic,
             cbar_title="Proportion")

Blog Maps



In [16]:

    
# Purple colour stops (light to dark). Defined here, but the maps below
# pass the named "Portland" scale instead.
scl = [
    [0.0, 'rgb(242,240,247)'],
    [0.2, 'rgb(218,218,235)'],
    [0.4, 'rgb(188,189,220)'],
    [0.6, 'rgb(158,154,200)'],
    [0.8, 'rgb(117,107,177)'],
    [1.0, 'rgb(84,39,143)'],
]
url_type = "blog"

# One map per topic, over tweets with at least one URL and a non-"USA" location.
for topic in topic_order:
    topic_rows = df[(df.u_state != "USA")
                    & (df.t_n_urls > 0)
                    & (df.topic_name == topic)]
    per_tweet = topic_rows.assign(**{
        url_type: lambda frame: frame.CATS_Counter.apply(
            lambda cats: cats.get(url_type, 0))
    })
    df_t = (per_tweet[["u_state", url_type]]
            .groupby("u_state")[url_type]
            .agg([np.mean, len, np.std])
            .reset_index())
    # Rank 1 = highest mean; computed before the filtering below.
    df_t["value_rank"] = df_t["mean"].rank(ascending=False)

    # Keep locations with at least the 10th-largest row count, then drop
    # the non-state codes.
    tenth_largest = df_t["len"].sort_values().values[-10]
    keep = (df_t["len"] >= tenth_largest) & (~df_t["u_state"].isin(NON_STATES))

    plot_map(df_t[keep], "u_state", "mean",
             ["u_state", "value_rank", "mean", "len", "std"],
             scl="Portland",
             title=topic,
             cbar_title="Proportion")

News Maps



In [17]:

    
# Purple colour stops (light to dark). Defined here, but the maps below
# pass the named "Portland" scale instead.
scl = [
    [0.0, 'rgb(242,240,247)'],
    [0.2, 'rgb(218,218,235)'],
    [0.4, 'rgb(188,189,220)'],
    [0.6, 'rgb(158,154,200)'],
    [0.8, 'rgb(117,107,177)'],
    [1.0, 'rgb(84,39,143)'],
]
url_type = "news"

# One map per topic, over tweets with at least one URL and a non-"USA" location.
for topic in topic_order:
    topic_rows = df[(df.u_state != "USA")
                    & (df.t_n_urls > 0)
                    & (df.topic_name == topic)]
    per_tweet = topic_rows.assign(**{
        url_type: lambda frame: frame.CATS_Counter.apply(
            lambda cats: cats.get(url_type, 0))
    })
    df_t = (per_tweet[["u_state", url_type]]
            .groupby("u_state")[url_type]
            .agg([np.mean, len, np.std])
            .reset_index())
    # Rank 1 = highest mean; computed before the filtering below.
    df_t["value_rank"] = df_t["mean"].rank(ascending=False)

    # Keep locations with at least the 10th-largest row count, then drop
    # the non-state codes.
    tenth_largest = df_t["len"].sort_values().values[-10]
    keep = (df_t["len"] >= tenth_largest) & (~df_t["u_state"].isin(NON_STATES))

    plot_map(df_t[keep], "u_state", "mean",
             ["u_state", "value_rank", "mean", "len", "std"],
             scl="Portland",
             title=topic,
             cbar_title="Proportion")



In [18]:

    
def plot_map_subplots(df, geo_key, topic, location_col, value_col, text_cols,
            scl="Portland", cbar_title=""):
    """Build the plotly traces and geo layout for one map of a subplot grid.

    Nothing is drawn here; the caller collects the pieces into one figure.

    Parameters
    ----------
    df : frame with one row per location.
    geo_key : name of the geo axis both traces are attached to.
    topic : text placed on the map by the label trace.
    location_col : column holding the location codes ('USA-states' mode).
    value_col : column whose values (cast to float) colour the map.
    text_cols : columns rendered into the hover text via ``get_string``.
    scl : colour scale passed to plotly as ``colorscale``.
    cbar_title : accepted but unused; the colour bar is hidden
        (``showscale`` is False).

    Returns
    -------
    (data, layout) : a two-element list ``[choropleth, text label]`` and the
        geo layout dict, whose ``domain`` x/y lists are left empty for the
        caller to fill in.
    """
    choropleth = {
        "type": 'choropleth',
        "colorscale": scl,
        "geo": geo_key,
        "autocolorscale": False,
        "showscale": False,
        "locations": df[location_col],
        "z": df[value_col].astype(float),
        "locationmode": 'USA-states',
        "text": df[text_cols].apply(get_string, axis=1),
        # White outlines between neighbouring states.
        "marker": {"line": {"color": 'rgb(255,255,255)', "width": 2}},
    }
    # Text-only marker at a fixed lon/lat that carries the topic name.
    topic_label = {
        "type": 'scattergeo',
        "showlegend": False,
        "lon": [-82],
        "lat": [50],
        "geo": geo_key,
        "text": [topic],
        "mode": 'text',
    }
    geo_layout = {
        "scope": 'usa',
        "projection": {"type": 'albers usa'},
        "showlakes": False,
        "domain": {"x": [], "y": []},
    }
    return [choropleth, topic_label], geo_layout



In [19]:

    
# Grid of small maps (one per topic) for a single URL category.
data = []
COLS = 3
ROWS = 2

url_type = "fakenews"
layout = dict(
    title='%s URL proportions per state' % url_type.title(),
    autosize=False,
    width=900,
    height=400,
    hovermode=False)

for i, topic in enumerate(topic_order):
    # The first map uses the key "geo"; later ones use "geo2", "geo3", ...
    # (the suffix is i+1, not i).
    geo_key = "geo%s" % (i + 1) if i != 0 else "geo"
    # Grid cell of this map. divmod performs integer division on every Python
    # version; plain `/` would yield fractional rows under true division.
    y, x = divmod(i, COLS)

    topic_rows = df[(df.u_state != "USA")
                    & (df.t_n_urls > 0)
                    & (df.topic_name == topic)]
    df_t = (topic_rows.assign(**{
                url_type: lambda frame: frame.CATS_Counter.apply(
                    lambda cats: cats.get(url_type, 0))
            })[["u_state", url_type]]
            .groupby("u_state")[url_type]
            .agg([np.mean, len, np.std])
            .reset_index())

    # Keep locations with at least the 10th-largest row count, then drop
    # the non-state codes.
    tenth_largest = df_t["len"].sort_values().values[-10]
    keep = (df_t["len"] >= tenth_largest) & (~df_t["u_state"].isin(NON_STATES))

    data_t, layout_t = plot_map_subplots(
        df_t[keep], geo_key, topic,
        "u_state", "mean", ["u_state", "len", "std"],
        scl='Portland',
        cbar_title="Proportion")
    data.extend(data_t)

    # Place the map in its grid cell using fractional figure coordinates.
    layout[geo_key] = layout_t
    layout[geo_key]['domain']['x'] = [float(x) / float(COLS), float(x + 1) / float(COLS)]
    layout[geo_key]['domain']['y'] = [float(y) / float(ROWS), float(y + 1) / float(ROWS)]
    # Single pre-formatted string so the call prints the same text under
    # both the print statement and the print function.
    print("%s %s %s %s" % (geo_key, x, y, layout[geo_key]["domain"]))

fig = dict(data=data, layout=layout)
py.offline.iplot(fig, filename='d3-cloropleth-map')









    



geo 0 0 {'y': [0.0, 0.5], 'x': [0.0, 0.3333333333333333]}
geo2 1 0 {'y': [0.0, 0.5], 'x': [0.3333333333333333, 0.6666666666666666]}
geo3 2 0 {'y': [0.0, 0.5], 'x': [0.6666666666666666, 1.0]}
geo4 0 1 {'y': [0.5, 1.0], 'x': [0.0, 0.3333333333333333]}
geo5 1 1 {'y': [0.5, 1.0], 'x': [0.3333333333333333, 0.6666666666666666]}
geo6 2 1 {'y': [0.5, 1.0], 'x': [0.6666666666666666, 1.0]}



In [20]:

    
# Grid of small maps (one per topic) for a single URL category.
data = []
COLS = 3
ROWS = 2

url_type = "blog"
layout = dict(
    title='%s URL proportions per state' % url_type.title(),
    autosize=False,
    width=900,
    height=400,
    hovermode=False)

for i, topic in enumerate(topic_order):
    # The first map uses the key "geo"; later ones use "geo2", "geo3", ...
    # (the suffix is i+1, not i).
    geo_key = "geo%s" % (i + 1) if i != 0 else "geo"
    # Grid cell of this map. divmod performs integer division on every Python
    # version; plain `/` would yield fractional rows under true division.
    y, x = divmod(i, COLS)

    topic_rows = df[(df.u_state != "USA")
                    & (df.t_n_urls > 0)
                    & (df.topic_name == topic)]
    df_t = (topic_rows.assign(**{
                url_type: lambda frame: frame.CATS_Counter.apply(
                    lambda cats: cats.get(url_type, 0))
            })[["u_state", url_type]]
            .groupby("u_state")[url_type]
            .agg([np.mean, len, np.std])
            .reset_index())

    # Keep locations with at least the 10th-largest row count, then drop
    # the non-state codes.
    tenth_largest = df_t["len"].sort_values().values[-10]
    keep = (df_t["len"] >= tenth_largest) & (~df_t["u_state"].isin(NON_STATES))

    data_t, layout_t = plot_map_subplots(
        df_t[keep], geo_key, topic,
        "u_state", "mean", ["u_state", "len", "std"],
        scl='Portland',
        cbar_title="Proportion")
    data.extend(data_t)

    # Place the map in its grid cell using fractional figure coordinates.
    layout[geo_key] = layout_t
    layout[geo_key]['domain']['x'] = [float(x) / float(COLS), float(x + 1) / float(COLS)]
    layout[geo_key]['domain']['y'] = [float(y) / float(ROWS), float(y + 1) / float(ROWS)]
    # Single pre-formatted string so the call prints the same text under
    # both the print statement and the print function.
    print("%s %s %s %s" % (geo_key, x, y, layout[geo_key]["domain"]))

fig = dict(data=data, layout=layout)
py.offline.iplot(fig, filename='d3-cloropleth-map')









    



geo 0 0 {'y': [0.0, 0.5], 'x': [0.0, 0.3333333333333333]}
geo2 1 0 {'y': [0.0, 0.5], 'x': [0.3333333333333333, 0.6666666666666666]}
geo3 2 0 {'y': [0.0, 0.5], 'x': [0.6666666666666666, 1.0]}
geo4 0 1 {'y': [0.5, 1.0], 'x': [0.0, 0.3333333333333333]}
geo5 1 1 {'y': [0.5, 1.0], 'x': [0.3333333333333333, 0.6666666666666666]}
geo6 2 1 {'y': [0.5, 1.0], 'x': [0.6666666666666666, 1.0]}



In [21]:

    
# Grid of small maps (one per topic) for a single URL category.
data = []
COLS = 3
ROWS = 2

url_type = "news"
layout = dict(
    title='%s URL proportions per state' % url_type.title(),
    autosize=False,
    width=900,
    height=400,
    hovermode=False)

for i, topic in enumerate(topic_order):
    # The first map uses the key "geo"; later ones use "geo2", "geo3", ...
    # (the suffix is i+1, not i).
    geo_key = "geo%s" % (i + 1) if i != 0 else "geo"
    # Grid cell of this map. divmod performs integer division on every Python
    # version; plain `/` would yield fractional rows under true division.
    y, x = divmod(i, COLS)

    topic_rows = df[(df.u_state != "USA")
                    & (df.t_n_urls > 0)
                    & (df.topic_name == topic)]
    df_t = (topic_rows.assign(**{
                url_type: lambda frame: frame.CATS_Counter.apply(
                    lambda cats: cats.get(url_type, 0))
            })[["u_state", url_type]]
            .groupby("u_state")[url_type]
            .agg([np.mean, len, np.std])
            .reset_index())

    # Keep locations with at least the 10th-largest row count, then drop
    # the non-state codes.
    tenth_largest = df_t["len"].sort_values().values[-10]
    keep = (df_t["len"] >= tenth_largest) & (~df_t["u_state"].isin(NON_STATES))

    data_t, layout_t = plot_map_subplots(
        df_t[keep], geo_key, topic,
        "u_state", "mean", ["u_state", "len", "std"],
        scl='Portland',
        cbar_title="Proportion")
    data.extend(data_t)

    # Place the map in its grid cell using fractional figure coordinates.
    layout[geo_key] = layout_t
    layout[geo_key]['domain']['x'] = [float(x) / float(COLS), float(x + 1) / float(COLS)]
    layout[geo_key]['domain']['y'] = [float(y) / float(ROWS), float(y + 1) / float(ROWS)]
    # Single pre-formatted string so the call prints the same text under
    # both the print statement and the print function.
    print("%s %s %s %s" % (geo_key, x, y, layout[geo_key]["domain"]))

fig = dict(data=data, layout=layout)
py.offline.iplot(fig, filename='d3-cloropleth-map')









    



geo 0 0 {'y': [0.0, 0.5], 'x': [0.0, 0.3333333333333333]}
geo2 1 0 {'y': [0.0, 0.5], 'x': [0.3333333333333333, 0.6666666666666666]}
geo3 2 0 {'y': [0.0, 0.5], 'x': [0.6666666666666666, 1.0]}
geo4 0 1 {'y': [0.5, 1.0], 'x': [0.0, 0.3333333333333333]}
geo5 1 1 {'y': [0.5, 1.0], 'x': [0.3333333333333333, 0.6666666666666666]}
geo6 2 1 {'y': [0.5, 1.0], 'x': [0.6666666666666666, 1.0]}



In [22]:

    
# For every topic: the locations with at least the 10th-largest number of
# URL-bearing tweets, ordered by mean 'fakenews' count and rendered as
# "STATE (mean) [n_tweets]". Non-state codes are not filtered out here.
df_topics = {}
for topic in topic_order:
    topic_rows = df[(df.u_state != "USA")
                    & (df.t_n_urls > 0)
                    & (df.topic_name == topic)]
    df_t = (topic_rows.assign(
                fakenews=lambda frame: frame.CATS_Counter.apply(
                    lambda cats: cats.get('fakenews', 0)))
            [["u_state", "fakenews"]]
            .groupby("u_state")["fakenews"]
            .agg([np.mean, len, np.std])
            .reset_index())
    df_t["value_rank"] = df_t["mean"].rank(ascending=False)

    tenth_largest = df_t["len"].sort_values().values[-10]
    ranked = (df_t[df_t["len"] >= tenth_largest]
              .sort_values("mean", ascending=False)
              .reset_index())
    df_topics[topic] = ranked.apply(
        lambda row: "%s (%.2f) [%s]" % (row["u_state"], row["mean"], row["len"]),
        axis=1)

# One column per topic, in display order.
pd.concat(df_topics, axis=1, keys=topic_order)









    Out[22]:







  
    
      
      Gun Control
      Privacy
      Vaccine
      Child Education
      Skin Damage
      Seat Belt
    
  
  
    
      0
      VA (0.18) [330]
      FL (0.07) [1252]
      FL (0.15) [745]
      DC (0.01) [154]
      IL (0.01) [236]
      TX (0.01) [759]
    
    
      1
      FL (0.18) [707]
      IL (0.07) [742]
      OH (0.14) [413]
      CA (0.01) [627]
      OH (0.01) [183]
      OH (0.01) [257]
    
    
      2
      TX (0.18) [938]
      DC (0.06) [1846]
      TX (0.13) [978]
      IL (0.01) [173]
      CA (0.00) [871]
      CA (0.01) [1404]
    
    
      3
      GA (0.13) [339]
      PA (0.06) [594]
      GA (0.08) [436]
      NY (0.00) [402]
      AZ (0.00) [236]
      GA (0.01) [356]
    
    
      4
      PA (0.12) [312]
      TX (0.06) [1526]
      CA (0.08) [3507]
      FL (0.00) [258]
      FL (0.00) [489]
      FL (0.01) [661]
    
    
      5
      IL (0.12) [421]
      NY (0.05) [2563]
      NY (0.08) [1660]
      GA (0.00) [138]
      GA (0.00) [275]
      PA (0.01) [346]
    
    
      6
      DC (0.12) [429]
      CA (0.05) [3165]
      PA (0.07) [432]
      MO (0.00) [127]
      NY (0.00) [636]
      NY (0.01) [965]
    
    
      7
      CA (0.11) [1530]
      VA (0.04) [715]
      DC (0.06) [385]
      NJ (0.00) [123]
      TX (0.00) [451]
      IL (0.01) [329]
    
    
      8
      NY (0.09) [1154]
      MA (0.04) [607]
      MA (0.06) [447]
      PA (0.00) [142]
      NC (0.00) [211]
      WA (0.00) [305]
    
    
      9
      WA (0.06) [394]
      WA (0.03) [1050]
      WA (0.06) [736]
      TX (0.00) [271]
      PA (0.00) [216]
      MI (0.00) [258]



In [23]:

    
# Bar chart of the mean of `is_controversial` per state (non-state codes removed).
fig, ax = plt.subplots(1, 1, figsize=(15, 5))

font_sizes = {
    "axes.titlesize": 14,
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 14,
}
states_only = df[~df.u_state.isin(NON_STATES)].sort_values("u_state")

with sns.plotting_context(rc=font_sizes), \
        sns.axes_style(rc={"font.family": "monospace"}):
    g = sns.barplot(x="u_state", y="is_controversial",
                    data=states_only,
                    errwidth=2, ax=ax, color="0.7")
    # Dashed reference line at 0.5.
    ax.axhline(y=0.5, linestyle='--', color="k", lw=1.)
    ax.set_ylabel("Proportion of controversial tweets")
    ax.set_xlabel("US States")
    sns.despine(offset=10)









    



/homed/content/anaconda3/envs/python2/lib/python2.7/site-packages/matplotlib/font_manager.py:1297: UserWarning:

findfont: Font family [u'sans-serif'] not found. Falling back to DejaVu Sans



In [24]:

    
# x-axis order: unknown / country-level first, then the remaining codes
# alphabetically, then the six codes listed explicitly below.
state_codes = sorted(
    set(df.u_state.fillna("UNK").value_counts().index) - NON_STATES)
LOCATION_ORDER = (["UNK", "USA"]
                  + state_codes
                  + sorted(["AS", "DC", "GU", "MP", "PR", "VI"]))
# One colour per bar, in LOCATION_ORDER order: 2 blue, 50 red, 6 grey.
colors = ["b"] * 2 + ["r"] * 50 + ["0.7"] * 6

fig, ax = plt.subplots(1, 1, figsize=(16, 5))
font_sizes = {
    "axes.titlesize": 14,
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 14,
}
with sns.plotting_context(rc=font_sizes), \
        sns.axes_style(rc={"font.family": "monospace"}):
    g = sns.barplot(x="u_state", y="is_controversial",
                    data=df.assign(u_state=df.u_state.fillna("UNK")),
                    errwidth=2, ax=ax, color="r", order=LOCATION_ORDER)
    # Dashed reference line at 0.5.
    ax.axhline(y=0.5, linestyle='--', color="k", lw=1.)
    ax.set_ylabel("Proportion of controversial tweets")
    ax.set_xlabel("US States")
    # Recolour the bars group by group.
    for i, c in enumerate(colors):
        ax.patches[i].set_color(c)
    sns.despine(offset=10)
    # Only the first three tick labels are rotated.
    plt.setp(ax.get_xticklabels()[:3], rotation=90)



In [25]:

    
# Same location ordering as the other location plots; only the middle
# slice (without the first two and last six codes) is drawn here.
state_codes = sorted(
    set(df.u_state.fillna("UNK").value_counts().index) - NON_STATES)
LOCATION_ORDER = (["UNK", "USA"]
                  + state_codes
                  + sorted(["AS", "DC", "GU", "MP", "PR", "VI"]))
# Defined for parity with the neighbouring cells; not applied to this plot.
colors = ["b"] * 2 + ["r"] * 50 + ["k"] * 6

# Controversial tweets whose location is not one of the non-state codes.
controversial_in_states = df[(df.is_controversial == 1)
                             & (~df.u_state.isin(NON_STATES))]
total_controversial = float(controversial_in_states.shape[0])

fig, ax = plt.subplots(1, 1, figsize=(16, 5))
font_sizes = {
    "axes.titlesize": 14,
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 14,
}
with sns.plotting_context(rc=font_sizes), \
        sns.axes_style(rc={"font.family": "monospace"}):
    # Bar height = this location's share of all controversial tweets counted above.
    g = sns.barplot(x="u_state", y="is_controversial",
                    data=controversial_in_states,
                    ax=ax, color="0.5",
                    order=LOCATION_ORDER[2:-6],
                    ci=None,
                    estimator=lambda x: len(x) / total_controversial)
    ax.set_ylabel("Distribution of controversial tweets\nacross states")
    ax.set_xlabel("US States")
    sns.despine(offset=10)



In [26]:

    
# Tweets per (state, topic), restricted to rows with a known, state-level location.
state_rows = df[(~df.u_state.isin(NON_STATES)) & (df.u_state.notnull())]
df_t = state_rows.pivot_table(
    index="u_state", columns="topic_name", values="t_id", aggfunc=len)

# Normalise each topic column by its total, giving every state's share of that topic.
topic_shares = df_t.divide(df_t.sum(axis=0), axis=1).reset_index()

font_sizes = {
    "axes.titlesize": 10,
    "axes.labelsize": 10,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
}
with sns.plotting_context(rc=font_sizes), \
        sns.axes_style(rc={"font.family": "monospace"}):
    # One narrow panel per topic, states on the shared y axis.
    g = sns.PairGrid(topic_shares,
                     x_vars=topic_order, y_vars=["u_state"],
                     size=10, aspect=.25)
    # Dot plot drawn with stripplot.
    g.map(sns.stripplot, size=10, orient="h",
          color="k", edgecolor="gray")

    g.set(xlabel="proportion", ylabel="")

    # Title each panel with its topic and show only horizontal grid lines.
    titles = topic_order
    for ax, title in zip(g.axes.flat, titles):
        ax.set(title=title)
        ax.xaxis.grid(False)
        ax.yaxis.grid(True)

    sns.despine(left=True, bottom=True)



In [27]:

    
# x-axis order: unknown / country-level first, then the remaining codes
# alphabetically, then the six codes listed explicitly below.
state_codes = sorted(
    set(df.u_state.fillna("UNK").value_counts().index) - NON_STATES)
LOCATION_ORDER = (["UNK", "USA"]
                  + state_codes
                  + sorted(["AS", "DC", "GU", "MP", "PR", "VI"]))
# One colour per bar, in LOCATION_ORDER order: 2 blue, 50 red, 6 grey.
colors = ["b"] * 2 + ["r"] * 50 + ["0.7"] * 6

font_sizes = {
    "axes.titlesize": 14,
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 14,
}
with sns.plotting_context(rc=font_sizes), \
        sns.axes_style(rc={"font.family": "monospace"}):
    fig, ax = plt.subplots(1, 1, figsize=(20, 5))
    # Tweets per author location (missing locations shown as "UNK"), log-scaled.
    ax = sns.countplot(df.u_state.fillna("UNK"), color='k', ax=ax,
                       order=LOCATION_ORDER)
    ax.set_yscale('log')
    ax.set_ylabel('Frequency')
    ax.set_xlabel('Tweet author location')
    plt.xticks(rotation='vertical')
    # Recolour the bars group by group.
    for i, c in enumerate(colors):
        ax.patches[i].set_color(c)



In [28]:

    
# Tweets per location (missing -> "UNK"), most frequent first, laid out as
# four side-by-side Location/Counts column pairs to keep the table short.
location_counts = df.u_state.fillna("UNK").value_counts()
quarters = np.array_split(location_counts, 4, axis=0)
pd.concat(
    [pd.DataFrame(part.reset_index().values, columns=["Location", "Counts"])
     for part in quarters],
    axis=1)



In [29]:

    
# Summary of the non-missing author locations (count, unique, top, freq).
df["u_state"].describe()









    Out[29]:





count     169038
unique        57
top           CA
freq       22123
Name: u_state, dtype: object



In [30]:

    
# Total number of rows, including those with a missing location.
df["u_state"].shape









    Out[30]:





(246869,)



In [31]:

    
# Per-user view: take the first `u_state` value of each `u_id`, then show
# the number of users and a summary of those locations.
first_state_per_user = df.groupby("u_id")["u_state"].first()
first_state_per_user.shape, first_state_per_user.describe()









    Out[31]:





((151073,), count     107970
 unique        57
 top           CA
 freq       13251
 Name: u_state, dtype: object)



In [ ]:

	u_state	mean	len	std
0	AK	0.618557	291	0.486578
1	AL	0.587302	1449	0.492489
2	AR	0.634062	869	0.481970
3	AS	0.300000	10	0.483046
4	AZ	0.573411	3242	0.494658
5	CA	0.654161	22123	0.475652
6	CO	0.617085	2669	0.486189
7	CT	0.594237	1284	0.491230
8	DC	0.822330	5150	0.382272
9	DE	0.543796	274	0.498990
10	FL	0.599461	8913	0.490035
11	GA	0.549006	4377	0.497649
12	GU	0.000000	5	0.000000
13	HI	0.555777	502	0.497375
14	IA	0.562099	934	0.496395
15	ID	0.611529	399	0.488015
16	IL	0.567998	5331	0.495401
17	IN	0.526071	3222	0.499397
18	KS	0.394879	1484	0.488989
19	KY	0.572368	1368	0.494916
20	LA	0.468478	1951	0.499133
21	MA	0.632763	3913	0.482114
22	MD	0.556561	2431	0.496893
23	ME	0.650817	673	0.477066
24	MI	0.574339	3141	0.494522
25	MN	0.572537	1675	0.494858
26	MO	0.564935	2002	0.495889
27	MP	1.000000	2	0.000000
28	MS	0.567430	786	0.495748
29	MT	0.674740	289	0.469284
30	NC	0.612668	3568	0.487209
31	ND	0.456250	160	0.499646
32	NE	0.458272	683	0.498621
33	NH	0.682051	585	0.466078
34	NJ	0.589065	3402	0.492076
35	NM	0.636574	432	0.481544
36	NV	0.588895	1693	0.492180
37	NY	0.656886	14689	0.474765
38	OH	0.580961	4601	0.493455
39	OK	0.555787	1443	0.497050
40	OR	0.651773	2171	0.476518
41	PA	0.606705	4653	0.488534
42	PR	0.625000	24	0.494535
43	RI	0.600000	455	0.490437
44	SC	0.573099	1539	0.494788
45	SD	0.405063	237	0.491943
46	TN	0.514493	2622	0.499885
47	TX	0.569604	11666	0.495153
48	UT	0.482531	1059	0.499931
49	VA	0.639094	3796	0.480327
50	VI	0.000000	5	0.000000
51	VT	0.715827	278	0.451833
52	WA	0.701355	5093	0.457709
53	WI	0.624379	1813	0.484416
54	WV	0.556923	325	0.497515
55	WY	0.606936	173	0.489849

	Location	Counts	Location	Counts	Location	Counts	Location	Counts
0	UNK	77831	NJ	3402	AL	1449	ID	399
1	CA	22123	AZ	3242	OK	1443	WV	325
2	USA	21114	IN	3222	KY	1368	AK	291
3	NY	14689	MI	3141	CT	1284	MT	289
4	TX	11666	CO	2669	UT	1059	VT	278
5	FL	8913	TN	2622	IA	934	DE	274
6	IL	5331	MD	2431	AR	869	SD	237
7	DC	5150	OR	2171	MS	786	WY	173
8	WA	5093	MO	2002	NE	683	ND	160
9	PA	4653	LA	1951	ME	673	PR	24
10	OH	4601	WI	1813	NH	585	AS	10
11	GA	4377	NV	1693	HI	502	GU	5
12	MA	3913	MN	1675	RI	455	VI	5
13	VA	3796	SC	1539	NM	432	MP	2
14	NC	3568	KS	1484	NaN	NaN	NaN	NaN

	Gun Control	Privacy	Vaccine	Child Education	Skin Damage	Seat Belt
0	VA (0.18) [330]	FL (0.07) [1252]	FL (0.15) [745]	DC (0.01) [154]	IL (0.01) [236]	TX (0.01) [759]
1	FL (0.18) [707]	IL (0.07) [742]	OH (0.14) [413]	CA (0.01) [627]	OH (0.01) [183]	OH (0.01) [257]
2	TX (0.18) [938]	DC (0.06) [1846]	TX (0.13) [978]	IL (0.01) [173]	CA (0.00) [871]	CA (0.01) [1404]
3	GA (0.13) [339]	PA (0.06) [594]	GA (0.08) [436]	NY (0.00) [402]	AZ (0.00) [236]	GA (0.01) [356]
4	PA (0.12) [312]	TX (0.06) [1526]	CA (0.08) [3507]	FL (0.00) [258]	FL (0.00) [489]	FL (0.01) [661]
5	IL (0.12) [421]	NY (0.05) [2563]	NY (0.08) [1660]	GA (0.00) [138]	GA (0.00) [275]	PA (0.01) [346]
6	DC (0.12) [429]	CA (0.05) [3165]	PA (0.07) [432]	MO (0.00) [127]	NY (0.00) [636]	NY (0.01) [965]
7	CA (0.11) [1530]	VA (0.04) [715]	DC (0.06) [385]	NJ (0.00) [123]	TX (0.00) [451]	IL (0.01) [329]
8	NY (0.09) [1154]	MA (0.04) [607]	MA (0.06) [447]	PA (0.00) [142]	NC (0.00) [211]	WA (0.00) [305]
9	WA (0.06) [394]	WA (0.03) [1050]	WA (0.06) [736]	TX (0.00) [271]	PA (0.00) [216]	MI (0.00) [258]